In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

import folium
from folium.plugins import HeatMap
import plotly.express as px

plt.style.use('fivethirtyeight')
%matplotlib inline
pd.set_option('display.max_columns', 32)
In [2]:
df = pd.read_csv(r'C:\Users\hp\Documents\Projects\Booking Tickets\hotel_bookings.csv')
df.head()
Out[2]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies meal country market_segment distribution_channel is_repeated_guest previous_cancellations previous_bookings_not_canceled reserved_room_type assigned_room_type booking_changes deposit_type agent company days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status reservation_status_date
0 Resort Hotel 0 342 2015 July 27 1 0 0 2 0.0 0 BB PRT Direct Direct 0 0 0 C C 3 No Deposit NaN NaN 0 Transient 0.0 0 0 Check-Out 2015-07-01
1 Resort Hotel 0 737 2015 July 27 1 0 0 2 0.0 0 BB PRT Direct Direct 0 0 0 C C 4 No Deposit NaN NaN 0 Transient 0.0 0 0 Check-Out 2015-07-01
2 Resort Hotel 0 7 2015 July 27 1 0 1 1 0.0 0 BB GBR Direct Direct 0 0 0 A C 0 No Deposit NaN NaN 0 Transient 75.0 0 0 Check-Out 2015-07-02
3 Resort Hotel 0 13 2015 July 27 1 0 1 1 0.0 0 BB GBR Corporate Corporate 0 0 0 A A 0 No Deposit 304.0 NaN 0 Transient 75.0 0 0 Check-Out 2015-07-02
4 Resort Hotel 0 14 2015 July 27 1 0 2 2 0.0 0 BB GBR Online TA TA/TO 0 0 0 A A 0 No Deposit 240.0 NaN 0 Transient 98.0 0 1 Check-Out 2015-07-03
In [3]:
df.describe()
Out[3]:
is_canceled lead_time arrival_date_year arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies is_repeated_guest previous_cancellations previous_bookings_not_canceled booking_changes agent company days_in_waiting_list adr required_car_parking_spaces total_of_special_requests
count 119390.000000 119390.000000 119390.000000 119390.000000 119390.000000 119390.000000 119390.000000 119390.000000 119386.000000 119390.000000 119390.000000 119390.000000 119390.000000 119390.000000 103050.000000 6797.000000 119390.000000 119390.000000 119390.000000 119390.000000
mean 0.370416 104.011416 2016.156554 27.165173 15.798241 0.927599 2.500302 1.856403 0.103890 0.007949 0.031912 0.087118 0.137097 0.221124 86.693382 189.266735 2.321149 101.831122 0.062518 0.571363
std 0.482918 106.863097 0.707476 13.605138 8.780829 0.998613 1.908286 0.579261 0.398561 0.097436 0.175767 0.844336 1.497437 0.652306 110.774548 131.655015 17.594721 50.535790 0.245291 0.792798
min 0.000000 0.000000 2015.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 6.000000 0.000000 -6.380000 0.000000 0.000000
25% 0.000000 18.000000 2016.000000 16.000000 8.000000 0.000000 1.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 9.000000 62.000000 0.000000 69.290000 0.000000 0.000000
50% 0.000000 69.000000 2016.000000 28.000000 16.000000 1.000000 2.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 14.000000 179.000000 0.000000 94.575000 0.000000 0.000000
75% 1.000000 160.000000 2017.000000 38.000000 23.000000 2.000000 3.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 229.000000 270.000000 0.000000 126.000000 0.000000 1.000000
max 1.000000 737.000000 2017.000000 53.000000 31.000000 19.000000 50.000000 55.000000 10.000000 10.000000 1.000000 26.000000 72.000000 21.000000 535.000000 543.000000 391.000000 5400.000000 8.000000 5.000000

Removing NULL values

In [4]:
null = pd.DataFrame({'Null Values' : df.isna().sum(), 'Percentage Null Values' : (df.isna().sum()) / (df.shape[0]) * (100)})
null
Out[4]:
Null Values Percentage Null Values
hotel 0 0.000000
is_canceled 0 0.000000
lead_time 0 0.000000
arrival_date_year 0 0.000000
arrival_date_month 0 0.000000
arrival_date_week_number 0 0.000000
arrival_date_day_of_month 0 0.000000
stays_in_weekend_nights 0 0.000000
stays_in_week_nights 0 0.000000
adults 0 0.000000
children 4 0.003350
babies 0 0.000000
meal 0 0.000000
country 488 0.408744
market_segment 0 0.000000
distribution_channel 0 0.000000
is_repeated_guest 0 0.000000
previous_cancellations 0 0.000000
previous_bookings_not_canceled 0 0.000000
reserved_room_type 0 0.000000
assigned_room_type 0 0.000000
booking_changes 0 0.000000
deposit_type 0 0.000000
agent 16340 13.686238
company 112593 94.306893
days_in_waiting_list 0 0.000000
customer_type 0 0.000000
adr 0 0.000000
required_car_parking_spaces 0 0.000000
total_of_special_requests 0 0.000000
reservation_status 0 0.000000
reservation_status_date 0 0.000000
In [5]:
df.fillna(0, inplace = True)

Adults, babies and children cannot be zero at same time, so dropping the rows having all these zero at same time

In [6]:
filter = (df.children == 0) & (df.adults == 0) & (df.babies == 0)
df[filter]
Out[6]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies meal country market_segment distribution_channel is_repeated_guest previous_cancellations previous_bookings_not_canceled reserved_room_type assigned_room_type booking_changes deposit_type agent company days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status reservation_status_date
2224 Resort Hotel 0 1 2015 October 41 6 0 3 0 0.0 0 SC PRT Corporate Corporate 0 0 0 A I 1 No Deposit 0.0 174.0 0 Transient-Party 0.00 0 0 Check-Out 2015-10-06
2409 Resort Hotel 0 0 2015 October 42 12 0 0 0 0.0 0 SC PRT Corporate Corporate 0 0 0 A I 0 No Deposit 0.0 174.0 0 Transient 0.00 0 0 Check-Out 2015-10-12
3181 Resort Hotel 0 36 2015 November 47 20 1 2 0 0.0 0 SC ESP Groups TA/TO 0 0 0 A C 0 No Deposit 38.0 0.0 0 Transient-Party 0.00 0 0 Check-Out 2015-11-23
3684 Resort Hotel 0 165 2015 December 53 30 1 4 0 0.0 0 SC PRT Groups TA/TO 0 0 0 A A 1 No Deposit 308.0 0.0 122 Transient-Party 0.00 0 0 Check-Out 2016-01-04
3708 Resort Hotel 0 165 2015 December 53 30 2 4 0 0.0 0 SC PRT Groups TA/TO 0 0 0 A C 1 No Deposit 308.0 0.0 122 Transient-Party 0.00 0 0 Check-Out 2016-01-05
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
115029 City Hotel 0 107 2017 June 26 27 0 3 0 0.0 0 BB CHE Online TA TA/TO 0 0 0 A A 1 No Deposit 7.0 0.0 0 Transient 100.80 0 0 Check-Out 2017-06-30
115091 City Hotel 0 1 2017 June 26 30 0 1 0 0.0 0 SC PRT Complementary Direct 0 0 0 E K 0 No Deposit 0.0 0.0 0 Transient 0.00 1 1 Check-Out 2017-07-01
116251 City Hotel 0 44 2017 July 28 15 1 1 0 0.0 0 SC SWE Online TA TA/TO 0 0 0 A K 2 No Deposit 425.0 0.0 0 Transient 73.80 0 0 Check-Out 2017-07-17
116534 City Hotel 0 2 2017 July 28 15 2 5 0 0.0 0 SC RUS Online TA TA/TO 0 0 0 A K 1 No Deposit 9.0 0.0 0 Transient-Party 22.86 0 1 Check-Out 2017-07-22
117087 City Hotel 0 170 2017 July 30 27 0 2 0 0.0 0 BB BRA Offline TA/TO TA/TO 0 0 0 A A 0 No Deposit 52.0 0.0 0 Transient 0.00 0 0 Check-Out 2017-07-29

180 rows × 32 columns

In [7]:
df = df[~filter]
df
Out[7]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies meal country market_segment distribution_channel is_repeated_guest previous_cancellations previous_bookings_not_canceled reserved_room_type assigned_room_type booking_changes deposit_type agent company days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status reservation_status_date
0 Resort Hotel 0 342 2015 July 27 1 0 0 2 0.0 0 BB PRT Direct Direct 0 0 0 C C 3 No Deposit 0.0 0.0 0 Transient 0.00 0 0 Check-Out 2015-07-01
1 Resort Hotel 0 737 2015 July 27 1 0 0 2 0.0 0 BB PRT Direct Direct 0 0 0 C C 4 No Deposit 0.0 0.0 0 Transient 0.00 0 0 Check-Out 2015-07-01
2 Resort Hotel 0 7 2015 July 27 1 0 1 1 0.0 0 BB GBR Direct Direct 0 0 0 A C 0 No Deposit 0.0 0.0 0 Transient 75.00 0 0 Check-Out 2015-07-02
3 Resort Hotel 0 13 2015 July 27 1 0 1 1 0.0 0 BB GBR Corporate Corporate 0 0 0 A A 0 No Deposit 304.0 0.0 0 Transient 75.00 0 0 Check-Out 2015-07-02
4 Resort Hotel 0 14 2015 July 27 1 0 2 2 0.0 0 BB GBR Online TA TA/TO 0 0 0 A A 0 No Deposit 240.0 0.0 0 Transient 98.00 0 1 Check-Out 2015-07-03
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
119385 City Hotel 0 23 2017 August 35 30 2 5 2 0.0 0 BB BEL Offline TA/TO TA/TO 0 0 0 A A 0 No Deposit 394.0 0.0 0 Transient 96.14 0 0 Check-Out 2017-09-06
119386 City Hotel 0 102 2017 August 35 31 2 5 3 0.0 0 BB FRA Online TA TA/TO 0 0 0 E E 0 No Deposit 9.0 0.0 0 Transient 225.43 0 2 Check-Out 2017-09-07
119387 City Hotel 0 34 2017 August 35 31 2 5 2 0.0 0 BB DEU Online TA TA/TO 0 0 0 D D 0 No Deposit 9.0 0.0 0 Transient 157.71 0 4 Check-Out 2017-09-07
119388 City Hotel 0 109 2017 August 35 31 2 5 2 0.0 0 BB GBR Online TA TA/TO 0 0 0 A A 0 No Deposit 89.0 0.0 0 Transient 104.40 0 0 Check-Out 2017-09-07
119389 City Hotel 0 205 2017 August 35 29 2 7 2 0.0 0 HB DEU Online TA TA/TO 0 0 0 A A 0 No Deposit 9.0 0.0 0 Transient 151.20 0 2 Check-Out 2017-09-07

119210 rows × 32 columns

EDA

Country with most Guest

In [8]:
country_wise_guests = df[df['is_canceled'] == 0]['country'].value_counts().reset_index()
country_wise_guests.columns = ['country', 'No of guests']
country_wise_guests
Out[8]:
country No of guests
0 PRT 20977
1 GBR 9668
2 FRA 8468
3 ESP 6383
4 DEU 6067
... ... ...
161 ZMB 1
162 DJI 1
163 GUY 1
164 CYM 1
165 PYF 1

166 rows × 2 columns

In [9]:
basemap = folium.Map()
guests_map = px.choropleth(country_wise_guests, locations = country_wise_guests['country'],
color = country_wise_guests['No of guests'], hover_name = country_wise_guests['country'])
guests_map.show()
In [10]:
data = df[df['is_canceled'] == 0]

px.box(data_frame = data, x = 'reserved_room_type', y = 'adr', color = 'hotel', template = 'plotly_dark')

Average Price per Room depends on its type and standard deviation

Price Variation Per Night

In [11]:
data_resort = df[(df['hotel'] == 'Resort Hotel') & (df['is_canceled'] == 0)]
data_city = df[(df['hotel'] == 'City Hotel') & (df['is_canceled'] == 0)]
resort_hotel = data_resort.groupby(['arrival_date_month'])['adr'].mean().reset_index()
resort_hotel
Out[11]:
arrival_date_month adr
0 April 75.867816
1 August 181.205892
2 December 68.410104
3 February 54.147478
4 January 48.761125
5 July 150.122528
6 June 107.974850
7 March 57.056838
8 May 76.657558
9 November 48.706289
10 October 61.775449
11 September 96.416860
In [12]:
city_hotel=data_city.groupby(['arrival_date_month'])['adr'].mean().reset_index()
city_hotel
Out[12]:
arrival_date_month adr
0 April 111.962267
1 August 118.674598
2 December 88.401855
3 February 86.520062
4 January 82.330983
5 July 115.818019
6 June 117.874360
7 March 90.658533
8 May 120.669827
9 November 86.946592
10 October 102.004672
11 September 112.776582
In [13]:
final_hotel = resort_hotel.merge(city_hotel, on = 'arrival_date_month')
final_hotel.columns = ['month', 'price_for_resort', 'price_for_city_hotel']
final_hotel
Out[13]:
month price_for_resort price_for_city_hotel
0 April 75.867816 111.962267
1 August 181.205892 118.674598
2 December 68.410104 88.401855
3 February 54.147478 86.520062
4 January 48.761125 82.330983
5 July 150.122528 115.818019
6 June 107.974850 117.874360
7 March 57.056838 90.658533
8 May 76.657558 120.669827
9 November 48.706289 86.946592
10 October 61.775449 102.004672
11 September 96.416860 112.776582
In [14]:
import sort_dataframeby_monthorweek as sd

def sort_month(df, column_name):
    return sd.Sort_Dataframeby_Month(df, column_name)
In [15]:
final_prices = sort_month(final_hotel, 'month')
final_prices
Out[15]:
month price_for_resort price_for_city_hotel
0 January 48.761125 82.330983
1 February 54.147478 86.520062
2 March 57.056838 90.658533
3 April 75.867816 111.962267
4 May 76.657558 120.669827
5 June 107.974850 117.874360
6 July 150.122528 115.818019
7 August 181.205892 118.674598
8 September 96.416860 112.776582
9 October 61.775449 102.004672
10 November 48.706289 86.946592
11 December 68.410104 88.401855
In [16]:
plt.figure(figsize = (17, 8))

px.line(final_prices, x = 'month', y = ['price_for_resort','price_for_city_hotel'],
        title = 'Room price per night over the Months', template = 'plotly_dark')
<Figure size 1224x576 with 0 Axes>

Most busy month

In [17]:
resort_guests = data_resort['arrival_date_month'].value_counts().reset_index()
resort_guests.columns=['month','no of guests']
resort_guests
Out[17]:
month no of guests
0 August 3257
1 July 3137
2 October 2575
3 March 2571
4 April 2550
5 May 2535
6 February 2308
7 September 2102
8 June 2037
9 December 2014
10 November 1975
11 January 1866
In [18]:
city_guests = data_city['arrival_date_month'].value_counts().reset_index()
city_guests.columns=['month','no of guests']
city_guests
Out[18]:
month no of guests
0 August 5367
1 July 4770
2 May 4568
3 June 4358
4 October 4326
5 September 4283
6 March 4049
7 April 4010
8 February 3051
9 November 2676
10 December 2377
11 January 2249
In [19]:
final_guests = resort_guests.merge(city_guests,on='month')
final_guests.columns=['month','no of guests in resort','no of guest in city hotel']
final_guests
Out[19]:
month no of guests in resort no of guest in city hotel
0 August 3257 5367
1 July 3137 4770
2 October 2575 4326
3 March 2571 4049
4 April 2550 4010
5 May 2535 4568
6 February 2308 3051
7 September 2102 4283
8 June 2037 4358
9 December 2014 2377
10 November 1975 2676
11 January 1866 2249
In [20]:
final_guests = sort_month(final_guests,'month')
final_guests
Out[20]:
month no of guests in resort no of guest in city hotel
0 January 1866 2249
1 February 2308 3051
2 March 2571 4049
3 April 2550 4010
4 May 2535 4568
5 June 2037 4358
6 July 3137 4770
7 August 3257 5367
8 September 2102 4283
9 October 2575 4326
10 November 1975 2676
11 December 2014 2377
In [21]:
px.line(final_guests, x = 'month', y = ['no of guests in resort','no of guest in city hotel'],
        title='Total no of guests per Months', template = 'plotly_dark')

Duration of Stay

In [22]:
filter = df['is_canceled'] == 0
data = df[filter]
data.head()
Out[22]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies meal country market_segment distribution_channel is_repeated_guest previous_cancellations previous_bookings_not_canceled reserved_room_type assigned_room_type booking_changes deposit_type agent company days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status reservation_status_date
0 Resort Hotel 0 342 2015 July 27 1 0 0 2 0.0 0 BB PRT Direct Direct 0 0 0 C C 3 No Deposit 0.0 0.0 0 Transient 0.0 0 0 Check-Out 2015-07-01
1 Resort Hotel 0 737 2015 July 27 1 0 0 2 0.0 0 BB PRT Direct Direct 0 0 0 C C 4 No Deposit 0.0 0.0 0 Transient 0.0 0 0 Check-Out 2015-07-01
2 Resort Hotel 0 7 2015 July 27 1 0 1 1 0.0 0 BB GBR Direct Direct 0 0 0 A C 0 No Deposit 0.0 0.0 0 Transient 75.0 0 0 Check-Out 2015-07-02
3 Resort Hotel 0 13 2015 July 27 1 0 1 1 0.0 0 BB GBR Corporate Corporate 0 0 0 A A 0 No Deposit 304.0 0.0 0 Transient 75.0 0 0 Check-Out 2015-07-02
4 Resort Hotel 0 14 2015 July 27 1 0 2 2 0.0 0 BB GBR Online TA TA/TO 0 0 0 A A 0 No Deposit 240.0 0.0 0 Transient 98.0 0 1 Check-Out 2015-07-03
In [23]:
data['total_nights'] = data['stays_in_weekend_nights'] + data['stays_in_week_nights']
data.head()
Out[23]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies meal country market_segment distribution_channel ... previous_cancellations previous_bookings_not_canceled reserved_room_type assigned_room_type booking_changes deposit_type agent company days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status reservation_status_date total_nights
0 Resort Hotel 0 342 2015 July 27 1 0 0 2 0.0 0 BB PRT Direct Direct ... 0 0 C C 3 No Deposit 0.0 0.0 0 Transient 0.0 0 0 Check-Out 2015-07-01 0
1 Resort Hotel 0 737 2015 July 27 1 0 0 2 0.0 0 BB PRT Direct Direct ... 0 0 C C 4 No Deposit 0.0 0.0 0 Transient 0.0 0 0 Check-Out 2015-07-01 0
2 Resort Hotel 0 7 2015 July 27 1 0 1 1 0.0 0 BB GBR Direct Direct ... 0 0 A C 0 No Deposit 0.0 0.0 0 Transient 75.0 0 0 Check-Out 2015-07-02 1
3 Resort Hotel 0 13 2015 July 27 1 0 1 1 0.0 0 BB GBR Corporate Corporate ... 0 0 A A 0 No Deposit 304.0 0.0 0 Transient 75.0 0 0 Check-Out 2015-07-02 1
4 Resort Hotel 0 14 2015 July 27 1 0 2 2 0.0 0 BB GBR Online TA TA/TO ... 0 0 A A 0 No Deposit 240.0 0.0 0 Transient 98.0 0 1 Check-Out 2015-07-03 2

5 rows × 33 columns

In [24]:
stay = data.groupby(['total_nights', 'hotel']).agg('count').reset_index()
stay = stay.iloc[:, :3]
stay = stay.rename(columns={'is_canceled':'Number of stays'})
stay
Out[24]:
total_nights hotel Number of stays
0 0 City Hotel 251
1 0 Resort Hotel 371
2 1 City Hotel 9155
3 1 Resort Hotel 6579
4 2 City Hotel 10983
... ... ... ...
57 46 Resort Hotel 1
58 48 City Hotel 1
59 56 Resort Hotel 1
60 60 Resort Hotel 1
61 69 Resort Hotel 1

62 rows × 3 columns

In [25]:
px.bar(data_frame = stay, x = 'total_nights', y = 'Number of stays', color = 'hotel', barmode = 'group',
        template = 'plotly_dark')

Data Pre Processing

In [26]:
plt.figure(figsize = (24, 12))

corr = df.corr()
sns.heatmap(corr, annot = True, linewidths = 1)
plt.show()
In [27]:
correlation = df.corr()['is_canceled'].abs().sort_values(ascending = False)
correlation
Out[27]:
is_canceled                       1.000000
lead_time                         0.292876
total_of_special_requests         0.234877
required_car_parking_spaces       0.195701
booking_changes                   0.144832
previous_cancellations            0.110139
is_repeated_guest                 0.083745
company                           0.083594
adults                            0.058182
previous_bookings_not_canceled    0.057365
days_in_waiting_list              0.054301
agent                             0.046770
adr                               0.046492
babies                            0.032569
stays_in_week_nights              0.025542
arrival_date_year                 0.016622
arrival_date_week_number          0.008315
arrival_date_day_of_month         0.005948
children                          0.004851
stays_in_weekend_nights           0.001323
Name: is_canceled, dtype: float64

Drop columns that are not useful

In [28]:
useless_col = ['days_in_waiting_list', 'arrival_date_year', 'arrival_date_year', 'assigned_room_type', 'booking_changes',
               'reservation_status', 'country', 'days_in_waiting_list']

df.drop(useless_col, axis = 1, inplace = True)
In [29]:
df.head()
Out[29]:
hotel is_canceled lead_time arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies meal market_segment distribution_channel is_repeated_guest previous_cancellations previous_bookings_not_canceled reserved_room_type deposit_type agent company customer_type adr required_car_parking_spaces total_of_special_requests reservation_status_date
0 Resort Hotel 0 342 July 27 1 0 0 2 0.0 0 BB Direct Direct 0 0 0 C No Deposit 0.0 0.0 Transient 0.0 0 0 2015-07-01
1 Resort Hotel 0 737 July 27 1 0 0 2 0.0 0 BB Direct Direct 0 0 0 C No Deposit 0.0 0.0 Transient 0.0 0 0 2015-07-01
2 Resort Hotel 0 7 July 27 1 0 1 1 0.0 0 BB Direct Direct 0 0 0 A No Deposit 0.0 0.0 Transient 75.0 0 0 2015-07-02
3 Resort Hotel 0 13 July 27 1 0 1 1 0.0 0 BB Corporate Corporate 0 0 0 A No Deposit 304.0 0.0 Transient 75.0 0 0 2015-07-02
4 Resort Hotel 0 14 July 27 1 0 2 2 0.0 0 BB Online TA TA/TO 0 0 0 A No Deposit 240.0 0.0 Transient 98.0 0 1 2015-07-03

Create numerical and categorical dataframes

In [30]:
cat_cols = [col for col in df.columns if df[col].dtype == 'O']
cat_cols
Out[30]:
['hotel',
 'arrival_date_month',
 'meal',
 'market_segment',
 'distribution_channel',
 'reserved_room_type',
 'deposit_type',
 'customer_type',
 'reservation_status_date']
In [31]:
cat_df = df[cat_cols]
cat_df.head()
Out[31]:
hotel arrival_date_month meal market_segment distribution_channel reserved_room_type deposit_type customer_type reservation_status_date
0 Resort Hotel July BB Direct Direct C No Deposit Transient 2015-07-01
1 Resort Hotel July BB Direct Direct C No Deposit Transient 2015-07-01
2 Resort Hotel July BB Direct Direct A No Deposit Transient 2015-07-02
3 Resort Hotel July BB Corporate Corporate A No Deposit Transient 2015-07-02
4 Resort Hotel July BB Online TA TA/TO A No Deposit Transient 2015-07-03
In [32]:
cat_df['reservation_status_date'] = pd.to_datetime(cat_df['reservation_status_date'])

cat_df['year'] = cat_df['reservation_status_date'].dt.year
cat_df['month'] = cat_df['reservation_status_date'].dt.month
cat_df['day'] = cat_df['reservation_status_date'].dt.day
In [35]:
cat_df.drop(['reservation_status_date','arrival_date_month'] , axis = 1, inplace = True)
In [36]:
cat_df.head()
Out[36]:
hotel meal market_segment distribution_channel reserved_room_type deposit_type customer_type year month day
0 Resort Hotel BB Direct Direct C No Deposit Transient 2015 7 1
1 Resort Hotel BB Direct Direct C No Deposit Transient 2015 7 1
2 Resort Hotel BB Direct Direct A No Deposit Transient 2015 7 2
3 Resort Hotel BB Corporate Corporate A No Deposit Transient 2015 7 2
4 Resort Hotel BB Online TA TA/TO A No Deposit Transient 2015 7 3

print unique values of each column

In [37]:
for col in cat_df.columns:
    print(f"{col}: \n{cat_df[col].unique()}\n")
hotel: 
['Resort Hotel' 'City Hotel']

meal: 
['BB' 'FB' 'HB' 'SC' 'Undefined']

market_segment: 
['Direct' 'Corporate' 'Online TA' 'Offline TA/TO' 'Complementary' 'Groups'
 'Undefined' 'Aviation']

distribution_channel: 
['Direct' 'Corporate' 'TA/TO' 'Undefined' 'GDS']

reserved_room_type: 
['C' 'A' 'D' 'E' 'G' 'F' 'H' 'L' 'B']

deposit_type: 
['No Deposit' 'Refundable' 'Non Refund']

customer_type: 
['Transient' 'Contract' 'Transient-Party' 'Group']

year: 
[2015 2014 2016 2017]

month: 
[ 7  5  4  6  3  8  9  1 11 10 12  2]

day: 
[ 1  2  3  6 22 23  5  7  8 11 15 16 29 19 18  9 13  4 12 26 17 10 20 14
 30 28 25 21 27 24 31]

encode categorical variables

In [38]:
cat_df['hotel'] = cat_df['hotel'].map({'Resort Hotel' : 0, 'City Hotel' : 1})

cat_df['meal'] = cat_df['meal'].map({'BB' : 0, 'FB': 1, 'HB': 2, 'SC': 3, 'Undefined': 4})

cat_df['market_segment'] = cat_df['market_segment'].map({'Direct': 0, 'Corporate': 1, 'Online TA': 2, 'Offline TA/TO': 3,
                                                           'Complementary': 4, 'Groups': 5, 'Undefined': 6, 'Aviation': 7})

cat_df['distribution_channel'] = cat_df['distribution_channel'].map({'Direct': 0, 'Corporate': 1, 'TA/TO': 2, 'Undefined': 3,
                                                                       'GDS': 4})

cat_df['reserved_room_type'] = cat_df['reserved_room_type'].map({'C': 0, 'A': 1, 'D': 2, 'E': 3, 'G': 4, 'F': 5, 'H': 6,
                                                                   'L': 7, 'B': 8})

cat_df['deposit_type'] = cat_df['deposit_type'].map({'No Deposit': 0, 'Refundable': 1, 'Non Refund': 3})

cat_df['customer_type'] = cat_df['customer_type'].map({'Transient': 0, 'Contract': 1, 'Transient-Party': 2, 'Group': 3})

cat_df['year'] = cat_df['year'].map({2015: 0, 2014: 1, 2016: 2, 2017: 3})
cat_df.head()
Out[38]:
hotel meal market_segment distribution_channel reserved_room_type deposit_type customer_type year month day
0 0 0 0 0 0 0 0 0 7 1
1 0 0 0 0 0 0 0 0 7 1
2 0 0 0 0 1 0 0 0 7 2
3 0 0 1 1 1 0 0 0 7 2
4 0 0 2 2 1 0 0 0 7 3
In [39]:
num_df = df.drop(columns = cat_cols, axis = 1)
num_df.drop('is_canceled', axis = 1, inplace = True)
num_df
Out[39]:
lead_time arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies is_repeated_guest previous_cancellations previous_bookings_not_canceled agent company adr required_car_parking_spaces total_of_special_requests
0 342 27 1 0 0 2 0.0 0 0 0 0 0.0 0.0 0.00 0 0
1 737 27 1 0 0 2 0.0 0 0 0 0 0.0 0.0 0.00 0 0
2 7 27 1 0 1 1 0.0 0 0 0 0 0.0 0.0 75.00 0 0
3 13 27 1 0 1 1 0.0 0 0 0 0 304.0 0.0 75.00 0 0
4 14 27 1 0 2 2 0.0 0 0 0 0 240.0 0.0 98.00 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
119385 23 35 30 2 5 2 0.0 0 0 0 0 394.0 0.0 96.14 0 0
119386 102 35 31 2 5 3 0.0 0 0 0 0 9.0 0.0 225.43 0 2
119387 34 35 31 2 5 2 0.0 0 0 0 0 9.0 0.0 157.71 0 4
119388 109 35 31 2 5 2 0.0 0 0 0 0 89.0 0.0 104.40 0 0
119389 205 35 29 2 7 2 0.0 0 0 0 0 9.0 0.0 151.20 0 2

119210 rows × 16 columns

In [40]:
num_df.var()
Out[40]:
lead_time                         11422.361808
arrival_date_week_number            184.990111
arrival_date_day_of_month            77.107192
stays_in_weekend_nights               0.990258
stays_in_week_nights                  3.599010
adults                                0.330838
children                              0.159070
babies                                0.009508
is_repeated_guest                     0.030507
previous_cancellations                0.713887
previous_bookings_not_canceled        2.244415
agent                             11485.169679
company                            2897.684308
adr                                2543.589039
required_car_parking_spaces           0.060201
total_of_special_requests             0.628652
dtype: float64

Normalizing numerical variables

In [41]:
num_df['lead_time'] = np.log(num_df['lead_time'] + 1)
num_df['arrival_date_week_number'] = np.log(num_df['arrival_date_week_number'] + 1)
num_df['arrival_date_day_of_month'] = np.log(num_df['arrival_date_day_of_month'] + 1)
num_df['agent'] = np.log(num_df['agent'] + 1)
num_df['company'] = np.log(num_df['company'] + 1)
num_df['adr'] = np.log(num_df['adr'] + 1)
In [42]:
num_df.var()
Out[42]:
lead_time                         2.582757
arrival_date_week_number          0.440884
arrival_date_day_of_month         0.506325
stays_in_weekend_nights           0.990258
stays_in_week_nights              3.599010
adults                            0.330838
children                          0.159070
babies                            0.009508
is_repeated_guest                 0.030507
previous_cancellations            0.713887
previous_bookings_not_canceled    2.244415
agent                             3.535793
company                           1.346883
adr                               0.515480
required_car_parking_spaces       0.060201
total_of_special_requests         0.628652
dtype: float64
In [43]:
num_df['adr'] = num_df['adr'].fillna(value = num_df['adr'].mean())
num_df.head()
Out[43]:
lead_time arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies is_repeated_guest previous_cancellations previous_bookings_not_canceled agent company adr required_car_parking_spaces total_of_special_requests
0 5.837730 3.332205 0.693147 0 0 2 0.0 0 0 0 0 0.000000 0.0 0.000000 0 0
1 6.603944 3.332205 0.693147 0 0 2 0.0 0 0 0 0 0.000000 0.0 0.000000 0 0
2 2.079442 3.332205 0.693147 0 1 1 0.0 0 0 0 0 0.000000 0.0 4.330733 0 0
3 2.639057 3.332205 0.693147 0 1 1 0.0 0 0 0 0 5.720312 0.0 4.330733 0 0
4 2.708050 3.332205 0.693147 0 2 2 0.0 0 0 0 0 5.484797 0.0 4.595120 0 1
In [44]:
X = pd.concat([cat_df, num_df], axis = 1)
y = df['is_canceled']
X.shape, y.shape
Out[44]:
((119210, 26), (119210,))

Splitting data into training set and test set

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)

X_train.head()
Out[45]:
hotel meal market_segment distribution_channel reserved_room_type deposit_type customer_type year month day lead_time arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies is_repeated_guest previous_cancellations previous_bookings_not_canceled agent company adr required_car_parking_spaces total_of_special_requests
97762 1 0 2 2 2 0 0 2 9 20 0.000000 3.688879 3.044522 0 0 2 0.0 0 1 0 0 2.302585 0.000000 0.000000 0 1
83513 1 0 1 1 1 0 2 2 2 6 3.465736 1.945910 1.609438 0 2 2 0.0 0 0 0 0 0.000000 5.164786 4.412798 0 0
7402 0 0 2 2 3 0 0 2 1 19 5.342334 3.526361 2.079442 2 5 2 0.0 0 0 0 0 5.484797 0.000000 5.065755 0 2
116739 1 0 2 2 2 0 0 3 7 24 4.934474 3.401197 3.044522 1 3 3 0.0 0 0 0 0 2.302585 0.000000 5.160491 0 1
10464 0 4 5 2 1 3 0 2 12 13 4.465908 2.397895 2.302585 0 3 2 0.0 0 0 0 0 4.219508 0.000000 3.850148 0 0
In [46]:
X_test.head()
Out[46]:
hotel meal market_segment distribution_channel reserved_room_type deposit_type customer_type year month day lead_time arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies is_repeated_guest previous_cancellations previous_bookings_not_canceled agent company adr required_car_parking_spaces total_of_special_requests
9743 0 0 3 2 2 0 0 2 10 25 4.204693 3.988984 3.433987 0 2 2 0.0 0 0 0 0 1.791759 0.0 5.093750 0 1
44448 1 0 5 2 1 3 0 0 7 3 4.584967 3.737670 1.791759 1 3 1 0.0 0 0 0 0 3.401197 0.0 4.948760 0 0
96364 1 3 2 2 1 0 0 2 9 2 2.564949 3.610918 3.465736 0 2 2 0.0 0 0 0 0 2.302585 0.0 4.783316 0 1
59463 1 0 5 2 1 3 0 2 3 15 5.513429 3.806662 3.367296 1 2 2 0.0 0 0 0 0 3.637586 0.0 4.709530 0 0
65942 1 0 2 2 1 0 0 3 3 16 4.007333 2.772589 2.397895 1 3 2 0.0 0 0 0 0 2.302585 0.0 4.844187 0 0
In [47]:
y_train.head(), y_test.head()
Out[47]:
(97762     0
 83513     0
 7402      1
 116739    0
 10464     1
 Name: is_canceled, dtype: int64,
 9743     1
 44448    1
 96364    0
 59463    1
 65942    1
 Name: is_canceled, dtype: int64)

Model Building

  1. Logistic Regression
In [49]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

acc_lr = accuracy_score(y_test, y_pred_lr)
conf = confusion_matrix(y_test, y_pred_lr)
clf_report = classification_report(y_test, y_pred_lr)

print(f"Accuracy Score of Logistic Regression is : {acc_lr}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Logistic Regression is : 0.8087688393031904
Confusion Matrix : 
[[21159  1279]
 [ 5560  7765]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.79      0.94      0.86     22438
           1       0.86      0.58      0.69     13325

    accuracy                           0.81     35763
   macro avg       0.83      0.76      0.78     35763
weighted avg       0.82      0.81      0.80     35763

  1. KNN
In [50]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)

acc_knn = accuracy_score(y_test, y_pred_knn)
conf = confusion_matrix(y_test, y_pred_knn)
clf_report = classification_report(y_test, y_pred_knn)

print(f"Accuracy Score of KNN is : {acc_knn}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of KNN is : 0.8891871487291335
Confusion Matrix : 
[[21639   799]
 [ 3164 10161]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.87      0.96      0.92     22438
           1       0.93      0.76      0.84     13325

    accuracy                           0.89     35763
   macro avg       0.90      0.86      0.88     35763
weighted avg       0.89      0.89      0.89     35763

  1. Decision Tree Classifier
In [51]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

y_pred_dtc = dtc.predict(X_test)

acc_dtc = accuracy_score(y_test, y_pred_dtc)
conf = confusion_matrix(y_test, y_pred_dtc)
clf_report = classification_report(y_test, y_pred_dtc)

print(f"Accuracy Score of Decision Tree is : {acc_dtc}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Decision Tree is : 0.9484103682576965
Confusion Matrix : 
[[21569   869]
 [  976 12349]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     22438
           1       0.93      0.93      0.93     13325

    accuracy                           0.95     35763
   macro avg       0.95      0.94      0.94     35763
weighted avg       0.95      0.95      0.95     35763

  1. Random Forest Classifier
In [52]:
rd_clf = RandomForestClassifier()
rd_clf.fit(X_train, y_train)

y_pred_rd_clf = rd_clf.predict(X_test)

acc_rd_clf = accuracy_score(y_test, y_pred_rd_clf)
conf = confusion_matrix(y_test, y_pred_rd_clf)
clf_report = classification_report(y_test, y_pred_rd_clf)

print(f"Accuracy Score of Random Forest is : {acc_rd_clf}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Random Forest is : 0.9539747784022593
Confusion Matrix : 
[[22279   159]
 [ 1487 11838]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.94      0.99      0.96     22438
           1       0.99      0.89      0.93     13325

    accuracy                           0.95     35763
   macro avg       0.96      0.94      0.95     35763
weighted avg       0.96      0.95      0.95     35763

  1. Ada Boost Classifier
In [53]:
ada = AdaBoostClassifier(base_estimator = dtc)
ada.fit(X_train, y_train)

y_pred_ada = ada.predict(X_test)

acc_ada = accuracy_score(y_test, y_pred_ada)
conf = confusion_matrix(y_test, y_pred_ada)
clf_report = classification_report(y_test, y_pred_ada)

print(f"Accuracy Score of Ada Boost Classifier is : {acc_ada}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Ada Boost Classifier is : 0.9482985208176048
Confusion Matrix : 
[[21565   873]
 [  976 12349]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     22438
           1       0.93      0.93      0.93     13325

    accuracy                           0.95     35763
   macro avg       0.95      0.94      0.94     35763
weighted avg       0.95      0.95      0.95     35763

  1. Gradient Boosting Classifier
In [58]:
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

y_pred_gb = gb.predict(X_test)

acc_gb = accuracy_score(y_test, y_pred_gb)
conf = confusion_matrix(y_test, y_pred_gb)
clf_report = classification_report(y_test, y_pred_gb)

print(f"Accuracy Score of Gradient Boosting Classifier is : {acc_gb}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Gradient Boosting Classifier is : 0.9073064340239912
Confusion Matrix : 
[[22162   276]
 [ 3039 10286]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.88      0.99      0.93     22438
           1       0.97      0.77      0.86     13325

    accuracy                           0.91     35763
   macro avg       0.93      0.88      0.90     35763
weighted avg       0.91      0.91      0.90     35763

  1. XgBoost Classifier
In [59]:
xgb = XGBClassifier(booster = 'gbtree', learning_rate = 0.1, max_depth = 5, n_estimators = 180)
xgb.fit(X_train, y_train)

y_pred_xgb = xgb.predict(X_test)

acc_xgb = accuracy_score(y_test, y_pred_xgb)
conf = confusion_matrix(y_test, y_pred_xgb)
clf_report = classification_report(y_test, y_pred_xgb)

print(f"Accuracy Score of XgBoost Classifier is : {acc_xgb}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
[17:43:57] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Accuracy Score of XgBoost Classifier is : 0.9801470793837206
Confusion Matrix : 
[[22425    13]
 [  697 12628]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     22438
           1       1.00      0.95      0.97     13325

    accuracy                           0.98     35763
   macro avg       0.98      0.97      0.98     35763
weighted avg       0.98      0.98      0.98     35763

  1. Cat Boost Classifier
In [56]:
cat = CatBoostClassifier(iterations=100)
cat.fit(X_train, y_train)

y_pred_cat = cat.predict(X_test)

acc_cat = accuracy_score(y_test, y_pred_cat)
conf = confusion_matrix(y_test, y_pred_cat)
clf_report = classification_report(y_test, y_pred_cat)
Learning rate set to 0.5
0:	learn: 0.4770269	total: 229ms	remaining: 22.7s
1:	learn: 0.4089092	total: 251ms	remaining: 12.3s
2:	learn: 0.3866005	total: 276ms	remaining: 8.94s
3:	learn: 0.3726167	total: 299ms	remaining: 7.17s
4:	learn: 0.3324213	total: 320ms	remaining: 6.08s
5:	learn: 0.3097578	total: 341ms	remaining: 5.34s
6:	learn: 0.2620394	total: 360ms	remaining: 4.78s
7:	learn: 0.2406249	total: 379ms	remaining: 4.35s
8:	learn: 0.2303044	total: 398ms	remaining: 4.02s
9:	learn: 0.1966051	total: 420ms	remaining: 3.78s
10:	learn: 0.1751360	total: 444ms	remaining: 3.59s
11:	learn: 0.1603410	total: 463ms	remaining: 3.4s
12:	learn: 0.1523437	total: 483ms	remaining: 3.23s
13:	learn: 0.1349777	total: 503ms	remaining: 3.09s
14:	learn: 0.1312845	total: 521ms	remaining: 2.95s
15:	learn: 0.1222717	total: 540ms	remaining: 2.83s
16:	learn: 0.1193100	total: 558ms	remaining: 2.73s
17:	learn: 0.1133295	total: 578ms	remaining: 2.63s
18:	learn: 0.1063014	total: 597ms	remaining: 2.54s
19:	learn: 0.1051583	total: 621ms	remaining: 2.48s
20:	learn: 0.1034562	total: 642ms	remaining: 2.42s
21:	learn: 0.0991366	total: 670ms	remaining: 2.37s
22:	learn: 0.0943621	total: 692ms	remaining: 2.32s
23:	learn: 0.0907212	total: 715ms	remaining: 2.26s
24:	learn: 0.0874195	total: 737ms	remaining: 2.21s
25:	learn: 0.0821334	total: 758ms	remaining: 2.16s
26:	learn: 0.0798692	total: 777ms	remaining: 2.1s
27:	learn: 0.0770581	total: 797ms	remaining: 2.05s
28:	learn: 0.0764868	total: 824ms	remaining: 2.02s
29:	learn: 0.0743634	total: 843ms	remaining: 1.97s
30:	learn: 0.0743365	total: 860ms	remaining: 1.91s
31:	learn: 0.0718587	total: 880ms	remaining: 1.87s
32:	learn: 0.0697601	total: 900ms	remaining: 1.83s
33:	learn: 0.0688807	total: 920ms	remaining: 1.79s
34:	learn: 0.0683348	total: 944ms	remaining: 1.75s
35:	learn: 0.0657610	total: 969ms	remaining: 1.72s
36:	learn: 0.0647132	total: 990ms	remaining: 1.69s
37:	learn: 0.0626886	total: 1.01s	remaining: 1.65s
38:	learn: 0.0619095	total: 1.04s	remaining: 1.62s
39:	learn: 0.0599795	total: 1.06s	remaining: 1.59s
40:	learn: 0.0590879	total: 1.08s	remaining: 1.55s
41:	learn: 0.0566762	total: 1.1s	remaining: 1.52s
42:	learn: 0.0555075	total: 1.12s	remaining: 1.48s
43:	learn: 0.0541327	total: 1.14s	remaining: 1.45s
44:	learn: 0.0529204	total: 1.16s	remaining: 1.42s
45:	learn: 0.0511744	total: 1.19s	remaining: 1.39s
46:	learn: 0.0501543	total: 1.21s	remaining: 1.36s
47:	learn: 0.0485847	total: 1.23s	remaining: 1.33s
48:	learn: 0.0485746	total: 1.25s	remaining: 1.3s
49:	learn: 0.0475529	total: 1.27s	remaining: 1.27s
50:	learn: 0.0470056	total: 1.3s	remaining: 1.25s
51:	learn: 0.0467095	total: 1.32s	remaining: 1.22s
52:	learn: 0.0449441	total: 1.35s	remaining: 1.2s
53:	learn: 0.0430612	total: 1.37s	remaining: 1.17s
54:	learn: 0.0421289	total: 1.4s	remaining: 1.14s
55:	learn: 0.0404613	total: 1.42s	remaining: 1.11s
56:	learn: 0.0400471	total: 1.44s	remaining: 1.08s
57:	learn: 0.0400424	total: 1.46s	remaining: 1.05s
58:	learn: 0.0394357	total: 1.48s	remaining: 1.02s
59:	learn: 0.0386042	total: 1.5s	remaining: 997ms
60:	learn: 0.0382579	total: 1.51s	remaining: 969ms
61:	learn: 0.0380642	total: 1.53s	remaining: 941ms
62:	learn: 0.0371938	total: 1.56s	remaining: 915ms
63:	learn: 0.0355726	total: 1.58s	remaining: 889ms
64:	learn: 0.0352402	total: 1.6s	remaining: 862ms
65:	learn: 0.0352402	total: 1.61s	remaining: 831ms
66:	learn: 0.0352350	total: 1.63s	remaining: 804ms
67:	learn: 0.0344630	total: 1.65s	remaining: 778ms
68:	learn: 0.0341551	total: 1.67s	remaining: 751ms
69:	learn: 0.0341551	total: 1.68s	remaining: 722ms
70:	learn: 0.0331507	total: 1.71s	remaining: 697ms
71:	learn: 0.0326697	total: 1.73s	remaining: 672ms
72:	learn: 0.0323195	total: 1.75s	remaining: 647ms
73:	learn: 0.0303645	total: 1.78s	remaining: 624ms
74:	learn: 0.0288498	total: 1.8s	remaining: 601ms
75:	learn: 0.0284408	total: 1.82s	remaining: 576ms
76:	learn: 0.0275904	total: 1.85s	remaining: 552ms
77:	learn: 0.0272158	total: 1.87s	remaining: 528ms
78:	learn: 0.0265226	total: 1.89s	remaining: 503ms
79:	learn: 0.0263916	total: 1.91s	remaining: 478ms
80:	learn: 0.0263916	total: 1.92s	remaining: 451ms
81:	learn: 0.0259878	total: 1.94s	remaining: 426ms
82:	learn: 0.0247036	total: 1.97s	remaining: 403ms
83:	learn: 0.0236941	total: 1.99s	remaining: 379ms
84:	learn: 0.0235769	total: 2.01s	remaining: 354ms
85:	learn: 0.0234891	total: 2.02s	remaining: 330ms
86:	learn: 0.0233983	total: 2.05s	remaining: 306ms
87:	learn: 0.0229775	total: 2.07s	remaining: 282ms
88:	learn: 0.0222823	total: 2.09s	remaining: 258ms
89:	learn: 0.0218272	total: 2.12s	remaining: 235ms
90:	learn: 0.0213133	total: 2.14s	remaining: 212ms
91:	learn: 0.0209254	total: 2.17s	remaining: 188ms
92:	learn: 0.0194585	total: 2.19s	remaining: 165ms
93:	learn: 0.0193712	total: 2.21s	remaining: 141ms
94:	learn: 0.0191785	total: 2.23s	remaining: 117ms
95:	learn: 0.0183353	total: 2.25s	remaining: 93.8ms
96:	learn: 0.0179554	total: 2.27s	remaining: 70.3ms
97:	learn: 0.0179211	total: 2.29s	remaining: 46.8ms
98:	learn: 0.0175292	total: 2.32s	remaining: 23.4ms
99:	learn: 0.0171780	total: 2.34s	remaining: 0us
In [60]:
print(f"Accuracy Score of Cat Boost Classifier is : {acc_cat}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Cat Boost Classifier is : 0.9939882000950703
Confusion Matrix : 
[[22425    13]
 [  697 12628]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     22438
           1       1.00      0.95      0.97     13325

    accuracy                           0.98     35763
   macro avg       0.98      0.97      0.98     35763
weighted avg       0.98      0.98      0.98     35763

  1. Extra Trees Classifier
In [62]:
etc = ExtraTreesClassifier()
etc.fit(X_train, y_train)

y_pred_etc = etc.predict(X_test)

acc_etc = accuracy_score(y_test, y_pred_etc)
conf = confusion_matrix(y_test, y_pred_etc)
clf_report = classification_report(y_test, y_pred_etc)

print(f"Accuracy Score of Extra Trees Classifier is : {acc_etc}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Extra Trees Classifier is : 0.9514582110001958
Confusion Matrix : 
[[22221   217]
 [ 1519 11806]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.94      0.99      0.96     22438
           1       0.98      0.89      0.93     13325

    accuracy                           0.95     35763
   macro avg       0.96      0.94      0.95     35763
weighted avg       0.95      0.95      0.95     35763

  1. LGBM Classifier
In [65]:
lgbm = LGBMClassifier(learning_rate = 1)
lgbm.fit(X_train, y_train)

y_pred_lgbm = lgbm.predict(X_test)

acc_lgbm = accuracy_score(y_test, y_pred_lgbm)
conf = confusion_matrix(y_test, y_pred_lgbm)
clf_report = classification_report(y_test, y_pred_lgbm)

print(f"Accuracy Score of LGBM Classifier is : {acc_lgbm}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of LGBM Classifier is : 0.9629784973296424
Confusion Matrix : 
[[21866   572]
 [  752 12573]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     22438
           1       0.96      0.94      0.95     13325

    accuracy                           0.96     35763
   macro avg       0.96      0.96      0.96     35763
weighted avg       0.96      0.96      0.96     35763

  1. Voting Classifier
In [64]:
classifiers = [('Gradient Boosting Classifier', gb), ('Cat Boost Classifier', cat), ('XGboost', xgb),  ('Decision Tree', dtc),
               ('Extra Tree', etc), ('Light Gradient', lgbm), ('Random Forest', rd_clf), ('Ada Boost', ada), ('Logistic', lr),
               ('Knn', knn)]
vc = VotingClassifier(estimators = classifiers)
vc.fit(X_train, y_train)
Learning rate set to 0.5
0:	learn: 0.4770269	total: 21.4ms	remaining: 2.12s
1:	learn: 0.4089092	total: 42.5ms	remaining: 2.08s
2:	learn: 0.3866005	total: 64.8ms	remaining: 2.09s
3:	learn: 0.3726167	total: 85.8ms	remaining: 2.06s
4:	learn: 0.3324213	total: 111ms	remaining: 2.12s
5:	learn: 0.3097578	total: 135ms	remaining: 2.12s
6:	learn: 0.2620394	total: 160ms	remaining: 2.12s
7:	learn: 0.2406249	total: 184ms	remaining: 2.11s
8:	learn: 0.2303044	total: 206ms	remaining: 2.08s
9:	learn: 0.1966051	total: 226ms	remaining: 2.04s
10:	learn: 0.1751360	total: 248ms	remaining: 2.01s
11:	learn: 0.1603410	total: 273ms	remaining: 2s
12:	learn: 0.1523437	total: 296ms	remaining: 1.98s
13:	learn: 0.1349777	total: 324ms	remaining: 1.99s
14:	learn: 0.1312845	total: 345ms	remaining: 1.95s
15:	learn: 0.1222717	total: 367ms	remaining: 1.93s
16:	learn: 0.1193100	total: 396ms	remaining: 1.94s
17:	learn: 0.1133295	total: 421ms	remaining: 1.92s
18:	learn: 0.1063014	total: 442ms	remaining: 1.89s
19:	learn: 0.1051583	total: 462ms	remaining: 1.85s
20:	learn: 0.1034562	total: 481ms	remaining: 1.81s
21:	learn: 0.0991366	total: 500ms	remaining: 1.77s
22:	learn: 0.0943621	total: 522ms	remaining: 1.75s
23:	learn: 0.0907212	total: 542ms	remaining: 1.72s
24:	learn: 0.0874195	total: 564ms	remaining: 1.69s
25:	learn: 0.0821334	total: 610ms	remaining: 1.74s
26:	learn: 0.0798692	total: 659ms	remaining: 1.78s
27:	learn: 0.0770581	total: 693ms	remaining: 1.78s
28:	learn: 0.0764868	total: 731ms	remaining: 1.79s
29:	learn: 0.0743634	total: 771ms	remaining: 1.8s
30:	learn: 0.0743365	total: 797ms	remaining: 1.77s
31:	learn: 0.0718587	total: 826ms	remaining: 1.75s
32:	learn: 0.0697601	total: 854ms	remaining: 1.73s
33:	learn: 0.0688807	total: 878ms	remaining: 1.71s
34:	learn: 0.0683348	total: 901ms	remaining: 1.67s
35:	learn: 0.0657610	total: 926ms	remaining: 1.65s
36:	learn: 0.0647132	total: 955ms	remaining: 1.63s
37:	learn: 0.0626886	total: 979ms	remaining: 1.6s
38:	learn: 0.0619095	total: 1000ms	remaining: 1.56s
39:	learn: 0.0599795	total: 1.02s	remaining: 1.53s
40:	learn: 0.0590879	total: 1.04s	remaining: 1.5s
41:	learn: 0.0566762	total: 1.06s	remaining: 1.47s
42:	learn: 0.0555075	total: 1.08s	remaining: 1.43s
43:	learn: 0.0541327	total: 1.1s	remaining: 1.4s
44:	learn: 0.0529204	total: 1.12s	remaining: 1.37s
45:	learn: 0.0511744	total: 1.15s	remaining: 1.34s
46:	learn: 0.0501543	total: 1.17s	remaining: 1.32s
47:	learn: 0.0485847	total: 1.19s	remaining: 1.29s
48:	learn: 0.0485746	total: 1.21s	remaining: 1.26s
49:	learn: 0.0475529	total: 1.23s	remaining: 1.23s
50:	learn: 0.0470056	total: 1.25s	remaining: 1.2s
51:	learn: 0.0467095	total: 1.27s	remaining: 1.17s
52:	learn: 0.0449441	total: 1.29s	remaining: 1.14s
53:	learn: 0.0430612	total: 1.31s	remaining: 1.12s
54:	learn: 0.0421289	total: 1.33s	remaining: 1.09s
55:	learn: 0.0404613	total: 1.35s	remaining: 1.06s
56:	learn: 0.0400471	total: 1.38s	remaining: 1.04s
57:	learn: 0.0400424	total: 1.4s	remaining: 1.01s
58:	learn: 0.0394357	total: 1.42s	remaining: 987ms
59:	learn: 0.0386042	total: 1.44s	remaining: 962ms
60:	learn: 0.0382579	total: 1.47s	remaining: 940ms
61:	learn: 0.0380642	total: 1.49s	remaining: 915ms
62:	learn: 0.0371938	total: 1.52s	remaining: 891ms
63:	learn: 0.0355726	total: 1.54s	remaining: 869ms
64:	learn: 0.0352402	total: 1.57s	remaining: 846ms
65:	learn: 0.0352402	total: 1.58s	remaining: 816ms
66:	learn: 0.0352350	total: 1.6s	remaining: 789ms
67:	learn: 0.0344630	total: 1.62s	remaining: 764ms
68:	learn: 0.0341551	total: 1.64s	remaining: 738ms
69:	learn: 0.0341551	total: 1.66s	remaining: 710ms
70:	learn: 0.0331507	total: 1.68s	remaining: 686ms
71:	learn: 0.0326697	total: 1.7s	remaining: 661ms
72:	learn: 0.0323195	total: 1.72s	remaining: 638ms
73:	learn: 0.0303645	total: 1.75s	remaining: 615ms
74:	learn: 0.0288498	total: 1.77s	remaining: 590ms
75:	learn: 0.0284408	total: 1.79s	remaining: 565ms
76:	learn: 0.0275904	total: 1.81s	remaining: 540ms
77:	learn: 0.0272158	total: 1.83s	remaining: 515ms
78:	learn: 0.0265226	total: 1.85s	remaining: 491ms
79:	learn: 0.0263916	total: 1.86s	remaining: 466ms
80:	learn: 0.0263916	total: 1.88s	remaining: 441ms
81:	learn: 0.0259878	total: 1.9s	remaining: 418ms
82:	learn: 0.0247036	total: 1.93s	remaining: 395ms
83:	learn: 0.0236941	total: 1.95s	remaining: 372ms
84:	learn: 0.0235769	total: 1.98s	remaining: 349ms
85:	learn: 0.0234891	total: 2s	remaining: 325ms
86:	learn: 0.0233983	total: 2.02s	remaining: 302ms
87:	learn: 0.0229775	total: 2.04s	remaining: 278ms
88:	learn: 0.0222823	total: 2.06s	remaining: 254ms
89:	learn: 0.0218272	total: 2.08s	remaining: 231ms
90:	learn: 0.0213133	total: 2.1s	remaining: 208ms
91:	learn: 0.0209254	total: 2.12s	remaining: 184ms
92:	learn: 0.0194585	total: 2.14s	remaining: 161ms
93:	learn: 0.0193712	total: 2.16s	remaining: 138ms
94:	learn: 0.0191785	total: 2.18s	remaining: 115ms
95:	learn: 0.0183353	total: 2.21s	remaining: 92.1ms
96:	learn: 0.0179554	total: 2.23s	remaining: 69ms
97:	learn: 0.0179211	total: 2.25s	remaining: 46ms
98:	learn: 0.0175292	total: 2.28s	remaining: 23ms
99:	learn: 0.0171780	total: 2.31s	remaining: 0us
[17:47:15] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Out[64]:
VotingClassifier(estimators=[('Gradient Boosting Classifier',
                              GradientBoostingClassifier()),
                             ('Cat Boost Classifier',
                              <catboost.core.CatBoostClassifier object at 0x000001B385B84D00>),
                             ('XGboost',
                              XGBClassifier(base_score=0.5, booster='gbtree',
                                            colsample_bylevel=1,
                                            colsample_bynode=1,
                                            colsample_bytree=1, gamma=0,
                                            gpu_id=-1, importance_type='gain',
                                            interaction_constrain...
                                            tree_method='exact',
                                            validate_parameters=1,
                                            verbosity=None)),
                             ('Decision Tree', DecisionTreeClassifier()),
                             ('Extra Tree', ExtraTreesClassifier()),
                             ('Light Gradient',
                              LGBMClassifier(learning_rate=1)),
                             ('Random Forest', RandomForestClassifier()),
                             ('Ada Boost',
                              AdaBoostClassifier(base_estimator=DecisionTreeClassifier())),
                             ('Logistic', LogisticRegression()),
                             ('Knn', KNeighborsClassifier())])
In [66]:
y_pred_vc = vc.predict(X_test)

acc_vtc = accuracy_score(y_test, y_pred_vc)
conf = confusion_matrix(y_test, y_pred_vc)
clf_report = classification_report(y_test, y_pred_vc)

print(f"Accuracy Score of Voting Classifier is : {acc_vtc}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Voting Classifier is : 0.9634818108100551
Confusion Matrix : 
[[22417    21]
 [ 1285 12040]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     22438
           1       1.00      0.90      0.95     13325

    accuracy                           0.96     35763
   macro avg       0.97      0.95      0.96     35763
weighted avg       0.97      0.96      0.96     35763

  1. ANN
In [70]:
from tensorflow.keras.utils import to_categorical

X = pd.concat([cat_df, num_df], axis = 1)
y = to_categorical(df['is_canceled'])
In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)
In [72]:
import keras
from keras.layers import Dense
from keras.models import Sequential

model  = Sequential()
model.add(Dense(100, activation = 'relu', input_shape = (26, )))
model.add(Dense(100, activation = 'relu'))
model.add(Dense(2, activation = 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model_history = model.fit(X_train, y_train, validation_data = (X_test, y_test),
                          epochs = 100)
Epoch 1/100
2608/2608 [==============================] - 22s 1ms/step - loss: 0.4372 - accuracy: 0.8078 - val_loss: 0.2424 - val_accuracy: 0.9166
Epoch 2/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.2014 - accuracy: 0.9298 - val_loss: 0.1293 - val_accuracy: 0.9554
Epoch 3/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.1282 - accuracy: 0.9579 - val_loss: 0.1195 - val_accuracy: 0.9601
Epoch 4/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.1020 - accuracy: 0.9672 - val_loss: 0.0918 - val_accuracy: 0.9716
Epoch 5/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0833 - accuracy: 0.9730 - val_loss: 0.0787 - val_accuracy: 0.9737
Epoch 6/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0771 - accuracy: 0.9755 - val_loss: 0.0753 - val_accuracy: 0.9748
Epoch 7/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0710 - accuracy: 0.9772 - val_loss: 0.0842 - val_accuracy: 0.9774
Epoch 8/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0670 - accuracy: 0.9783 - val_loss: 0.0626 - val_accuracy: 0.9796
Epoch 9/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0555 - accuracy: 0.9822 - val_loss: 0.0626 - val_accuracy: 0.9836
Epoch 10/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0577 - accuracy: 0.9821 - val_loss: 0.0499 - val_accuracy: 0.9859
Epoch 11/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0522 - accuracy: 0.9835 - val_loss: 0.0531 - val_accuracy: 0.9838
Epoch 12/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0525 - accuracy: 0.9831 - val_loss: 0.0871 - val_accuracy: 0.9756
Epoch 13/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0520 - accuracy: 0.9837 - val_loss: 0.0444 - val_accuracy: 0.9870
Epoch 14/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0473 - accuracy: 0.9851 - val_loss: 0.0654 - val_accuracy: 0.9832
Epoch 15/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0452 - accuracy: 0.9859 - val_loss: 0.0476 - val_accuracy: 0.9870
Epoch 16/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0485 - accuracy: 0.9846 - val_loss: 0.0577 - val_accuracy: 0.9825
Epoch 17/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0428 - accuracy: 0.9867 - val_loss: 0.1203 - val_accuracy: 0.9744
Epoch 18/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0443 - accuracy: 0.9861 - val_loss: 0.0584 - val_accuracy: 0.9846
Epoch 19/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0389 - accuracy: 0.9879 - val_loss: 0.0413 - val_accuracy: 0.9865
Epoch 20/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0397 - accuracy: 0.9880 - val_loss: 0.0390 - val_accuracy: 0.9893
Epoch 21/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0364 - accuracy: 0.9888 - val_loss: 0.0431 - val_accuracy: 0.9869
Epoch 22/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0401 - accuracy: 0.9874 - val_loss: 0.0535 - val_accuracy: 0.9839
Epoch 23/100
2608/2608 [==============================] - 4s 2ms/step - loss: 0.0384 - accuracy: 0.9882 - val_loss: 0.0459 - val_accuracy: 0.9864
Epoch 24/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0377 - accuracy: 0.9885 - val_loss: 0.0550 - val_accuracy: 0.9836
Epoch 25/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0383 - accuracy: 0.9875 - val_loss: 0.0673 - val_accuracy: 0.9808
Epoch 26/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0373 - accuracy: 0.9881 - val_loss: 0.0355 - val_accuracy: 0.9884
Epoch 27/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0299 - accuracy: 0.9907 - val_loss: 0.0398 - val_accuracy: 0.9879
Epoch 28/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0336 - accuracy: 0.9892 - val_loss: 0.0333 - val_accuracy: 0.9901
Epoch 29/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0311 - accuracy: 0.9902 - val_loss: 0.0395 - val_accuracy: 0.9878
Epoch 30/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0351 - accuracy: 0.9894 - val_loss: 0.0318 - val_accuracy: 0.9911
Epoch 31/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0303 - accuracy: 0.9905 - val_loss: 0.0404 - val_accuracy: 0.9892
Epoch 32/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0340 - accuracy: 0.9893 - val_loss: 0.0332 - val_accuracy: 0.9898
Epoch 33/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0273 - accuracy: 0.9914 - val_loss: 0.0465 - val_accuracy: 0.9853
Epoch 34/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0323 - accuracy: 0.9896 - val_loss: 0.0452 - val_accuracy: 0.9867
Epoch 35/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0289 - accuracy: 0.9906 - val_loss: 0.0339 - val_accuracy: 0.9911
Epoch 36/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0307 - accuracy: 0.9907 - val_loss: 0.0283 - val_accuracy: 0.9925
Epoch 37/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0279 - accuracy: 0.9914 - val_loss: 0.0314 - val_accuracy: 0.9915
Epoch 38/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0293 - accuracy: 0.9909 - val_loss: 0.0310 - val_accuracy: 0.9920
Epoch 39/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0293 - accuracy: 0.9911 - val_loss: 0.0414 - val_accuracy: 0.9872
Epoch 40/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0293 - accuracy: 0.9905 - val_loss: 0.0381 - val_accuracy: 0.9886
Epoch 41/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0327 - accuracy: 0.9896 - val_loss: 0.0428 - val_accuracy: 0.9886
Epoch 42/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0286 - accuracy: 0.9913 - val_loss: 0.0286 - val_accuracy: 0.9914
Epoch 43/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0272 - accuracy: 0.9915 - val_loss: 0.0362 - val_accuracy: 0.9891
Epoch 44/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0260 - accuracy: 0.9913 - val_loss: 0.0466 - val_accuracy: 0.9868
Epoch 45/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0283 - accuracy: 0.9917 - val_loss: 0.0351 - val_accuracy: 0.9903
Epoch 46/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0268 - accuracy: 0.9917 - val_loss: 0.0437 - val_accuracy: 0.9856
Epoch 47/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0300 - accuracy: 0.9907 - val_loss: 0.0312 - val_accuracy: 0.9911
Epoch 48/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0266 - accuracy: 0.9909 - val_loss: 0.0395 - val_accuracy: 0.9896
Epoch 49/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0297 - accuracy: 0.9904 - val_loss: 0.0362 - val_accuracy: 0.9902
Epoch 50/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0244 - accuracy: 0.9924 - val_loss: 0.0418 - val_accuracy: 0.9874
Epoch 51/100
2608/2608 [==============================] - 4s 2ms/step - loss: 0.0254 - accuracy: 0.9918 - val_loss: 0.0333 - val_accuracy: 0.9911
Epoch 52/100
2608/2608 [==============================] - 4s 2ms/step - loss: 0.0246 - accuracy: 0.9919 - val_loss: 0.0338 - val_accuracy: 0.9909
Epoch 53/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0264 - accuracy: 0.9917 - val_loss: 0.0352 - val_accuracy: 0.9906
Epoch 54/100
2608/2608 [==============================] - 4s 2ms/step - loss: 0.0231 - accuracy: 0.9928 - val_loss: 0.0448 - val_accuracy: 0.9880
Epoch 55/100
2608/2608 [==============================] - 4s 2ms/step - loss: 0.0247 - accuracy: 0.9920 - val_loss: 0.0454 - val_accuracy: 0.9865
Epoch 56/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0242 - accuracy: 0.9926 - val_loss: 0.0368 - val_accuracy: 0.9901
Epoch 57/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0224 - accuracy: 0.9929 - val_loss: 0.0304 - val_accuracy: 0.9908
Epoch 58/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0226 - accuracy: 0.9928 - val_loss: 0.0341 - val_accuracy: 0.9902
Epoch 59/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0249 - accuracy: 0.9924 - val_loss: 0.0933 - val_accuracy: 0.9817
Epoch 60/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0238 - accuracy: 0.9924 - val_loss: 0.0305 - val_accuracy: 0.9914
Epoch 61/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0230 - accuracy: 0.9927 - val_loss: 0.0263 - val_accuracy: 0.9930
Epoch 62/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0217 - accuracy: 0.9931 - val_loss: 0.0370 - val_accuracy: 0.9909
Epoch 63/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0244 - accuracy: 0.9928 - val_loss: 0.0496 - val_accuracy: 0.9875
Epoch 64/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0218 - accuracy: 0.9930 - val_loss: 0.0478 - val_accuracy: 0.9874
Epoch 65/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0220 - accuracy: 0.9930 - val_loss: 0.1135 - val_accuracy: 0.9793
Epoch 66/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0194 - accuracy: 0.9941 - val_loss: 0.0439 - val_accuracy: 0.9888
Epoch 67/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0282 - accuracy: 0.9913 - val_loss: 0.0385 - val_accuracy: 0.9908
Epoch 68/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0198 - accuracy: 0.9935 - val_loss: 0.0408 - val_accuracy: 0.9900
Epoch 69/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0225 - accuracy: 0.9928 - val_loss: 0.0441 - val_accuracy: 0.9871
Epoch 70/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0191 - accuracy: 0.9934 - val_loss: 0.0318 - val_accuracy: 0.9916
Epoch 71/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0205 - accuracy: 0.9935 - val_loss: 0.0784 - val_accuracy: 0.9845
Epoch 72/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0227 - accuracy: 0.9927 - val_loss: 0.0359 - val_accuracy: 0.9916
Epoch 73/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0210 - accuracy: 0.9932 - val_loss: 0.0353 - val_accuracy: 0.9919
Epoch 74/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0189 - accuracy: 0.9940 - val_loss: 0.0450 - val_accuracy: 0.9866
Epoch 75/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0220 - accuracy: 0.9927 - val_loss: 0.0312 - val_accuracy: 0.9914
Epoch 76/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0212 - accuracy: 0.9934 - val_loss: 0.0320 - val_accuracy: 0.9916
Epoch 77/100
2608/2608 [==============================] - 4s 2ms/step - loss: 0.0196 - accuracy: 0.9939 - val_loss: 0.0367 - val_accuracy: 0.9901
Epoch 78/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0220 - accuracy: 0.9928 - val_loss: 0.0801 - val_accuracy: 0.9843
Epoch 79/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0232 - accuracy: 0.9934 - val_loss: 0.0361 - val_accuracy: 0.9902
Epoch 80/100
2608/2608 [==============================] - 4s 2ms/step - loss: 0.0162 - accuracy: 0.9946 - val_loss: 0.0311 - val_accuracy: 0.9922
Epoch 81/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0230 - accuracy: 0.9934 - val_loss: 0.0353 - val_accuracy: 0.9897
Epoch 82/100
2608/2608 [==============================] - 4s 2ms/step - loss: 0.0212 - accuracy: 0.9934 - val_loss: 0.0458 - val_accuracy: 0.9895
Epoch 83/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0197 - accuracy: 0.9943 - val_loss: 0.0349 - val_accuracy: 0.9921
Epoch 84/100
2608/2608 [==============================] - 4s 2ms/step - loss: 0.0182 - accuracy: 0.9942 - val_loss: 0.0322 - val_accuracy: 0.9925
Epoch 85/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0174 - accuracy: 0.9949 - val_loss: 0.0384 - val_accuracy: 0.9893
Epoch 86/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0259 - accuracy: 0.9918 - val_loss: 0.0384 - val_accuracy: 0.9902
Epoch 87/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0202 - accuracy: 0.9934 - val_loss: 0.0363 - val_accuracy: 0.9915
Epoch 88/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0201 - accuracy: 0.9936 - val_loss: 0.0332 - val_accuracy: 0.9926
Epoch 89/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0162 - accuracy: 0.9946 - val_loss: 0.0557 - val_accuracy: 0.9876
Epoch 90/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0159 - accuracy: 0.9949 - val_loss: 0.0390 - val_accuracy: 0.9900
Epoch 91/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0207 - accuracy: 0.9937 - val_loss: 0.0507 - val_accuracy: 0.9881
Epoch 92/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0172 - accuracy: 0.9946 - val_loss: 0.0399 - val_accuracy: 0.9895
Epoch 93/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0199 - accuracy: 0.9934 - val_loss: 0.0532 - val_accuracy: 0.9895
Epoch 94/100
2608/2608 [==============================] - 4s 1ms/step - loss: 0.0172 - accuracy: 0.9945 - val_loss: 0.0448 - val_accuracy: 0.9910
Epoch 95/100
2608/2608 [==============================] - 4s 2ms/step - loss: 0.0185 - accuracy: 0.9937 - val_loss: 0.0377 - val_accuracy: 0.9904
Epoch 96/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0158 - accuracy: 0.9952 - val_loss: 0.0360 - val_accuracy: 0.9918
Epoch 97/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0171 - accuracy: 0.9946 - val_loss: 0.0685 - val_accuracy: 0.9769
Epoch 98/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0147 - accuracy: 0.9954 - val_loss: 0.0367 - val_accuracy: 0.9912
Epoch 99/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0168 - accuracy: 0.9948 - val_loss: 0.0360 - val_accuracy: 0.9922
Epoch 100/100
2608/2608 [==============================] - 3s 1ms/step - loss: 0.0173 - accuracy: 0.9942 - val_loss: 0.0407 - val_accuracy: 0.9916
In [73]:
plt.figure(figsize = (12, 6))

train_loss = model_history.history['loss']
val_loss = model_history.history['val_loss'] 
epoch = range(1, 101)

loss = pd.DataFrame({'train_loss' : train_loss, 'val_loss' : val_loss})

px.line(data_frame = loss, x = epoch, y = ['val_loss', 'train_loss'], title = 'Training and Validation Loss',
        template = 'plotly_dark')
<Figure size 864x432 with 0 Axes>
In [74]:
plt.figure(figsize = (12, 6))

train_acc = model_history.history['accuracy']
val_acc = model_history.history['val_accuracy'] 
epoch = range(1, 101)


accuracy = pd.DataFrame({'train_acc' : train_acc, 'val_acc' : val_acc})

px.line(data_frame = accuracy, x = epoch, y = ['val_acc', 'train_acc'], title = 'Training and Validation Accuracy',
        template = 'plotly_dark')
<Figure size 864x432 with 0 Axes>
In [75]:
acc_ann = model.evaluate(X_test, y_test)[1]

print(f'Accuracy of model is {acc_ann}')
1118/1118 [==============================] - 1s 642us/step - loss: 0.0407 - accuracy: 0.9916
Accuracy of model is 0.9916393756866455

Model Comparison

In [76]:
models = pd.DataFrame({
    'Model' : ['Logistic Regression', 'KNN', 'Decision Tree Classifier', 'Random Forest Classifier','Ada Boost Classifier',
             'Gradient Boosting Classifier', 'XgBoost', 'Cat Boost', 'Extra Trees Classifier', 'LGBM', 'Voting Classifier'
               , 'ANN'],
    'Score' : [acc_lr, acc_knn, acc_dtc, acc_rd_clf, acc_ada, acc_gb, acc_xgb, acc_cat, acc_etc, acc_lgbm, acc_vtc, acc_ann]
})


models.sort_values(by = 'Score', ascending = False)
Out[76]:
Model Score
7 Cat Boost 0.993988
11 ANN 0.991639
6 XgBoost 0.980147
10 Voting Classifier 0.963482
9 LGBM 0.962978
3 Random Forest Classifier 0.953975
8 Extra Trees Classifier 0.951458
2 Decision Tree Classifier 0.948410
4 Ada Boost Classifier 0.948299
5 Gradient Boosting Classifier 0.907306
1 KNN 0.889187
0 Logistic Regression 0.808769
In [77]:
px.bar(data_frame = models, x = 'Score', y = 'Model', color = 'Score', template = 'plotly_dark', title = 'Models Comparison')
In [ ]: